knitr::opts_chunk$set(message=FALSE, warning=FALSE, eval=T, cache=F)
For today’s workshop, we’re going to use R to go through a typical bioinformatics analysis workflow. We’re going to use common bioinformatics techniques to visualize data and make beautiful figures.
The data we will analyze is breast cancer RNA-Seq data from TCGA, a popular publicly-available database for cancer-related datasets. The goal of the analysis will be to identify genes that show significant changes in expression between normal and tumor tissues, followed by identifying the pathways they are associated with. After importing the data and performing some data pre-processing, we will carry out differntial expression analysis and gene set enrichment analysis.
Main steps in today’s workshop:
Make sure to have the following packages installed for this workshop:
BiobasedplyrDESeq2fgseaggplot2msigdbrfgseaAn expression set is a data object consisting of three entities: the expression matrix (exprs), the phenotye data (pData), and the feature data (fData).
We read in the RDS file included in this repo. It corresponds to a subset of samples from a gene expression dataset of breast cancer (BRCA) primary tissue samples from the TCGA project.
library(Biobase)
library(magrittr)
library(dplyr)
library(ggplot2)
library(Biobase)
library(ggfortify)
library(plotly)
brca <- readRDS("data/TCGA-BRCA.rds")
# dimensions of the expression data
dim(brca)
## Features Samples
## 36812 1222
# dimensions of the gene annotation
dim(fData(brca))
## [1] 36812 4
# first few rows of gene annotations
head(fData(brca)[,c("ensembl_transcript_id", "ensembl_gene_id", "hgnc_symbol")])
## ensembl_transcript_id ensembl_gene_id hgnc_symbol
## TSPAN6 ENSG00000000003.13 ENSG00000000003 TSPAN6
## TNMD ENSG00000000005.5 ENSG00000000005 TNMD
## DPM1 ENSG00000000419.11 ENSG00000000419 DPM1
## SCYL3 ENSG00000000457.12 ENSG00000000457 SCYL3
## C1orf112 ENSG00000000460.15 ENSG00000000460 C1orf112
## FGR ENSG00000000938.11 ENSG00000000938 FGR
# dimensions of the phenotypic annotation
dim(pData(brca))
## [1] 1222 65
# first few rows of phenotype
head(pData(brca)[,c("patient_id", "sample_type", "tumor_subtype")])
## patient_id sample_type tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085 Primary Tumor LumB
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY Primary Tumor LumA
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z Primary Tumor LumB
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU Primary Tumor LumA
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN Primary Tumor Her2
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W Primary Tumor LumB
# how many of each sample type?
table(pData(brca)$sample_type)
##
## Metastatic Primary Tumor Solid Tissue Normal
## 7 1102 113
# how many tumor subtypes?
table(pData(brca)$tumor_subtype)
##
## Basal Her2 LumA LumB Normal
## 169 209 510 198 16
exprs(brca) <- log2(exprs(brca) + 1)
exprs(brca)[1:5,1:5]
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A2-A0SY-01A-31R-A084-07
## TSPAN6 13.975579 10.981567
## TNMD 1.584963 6.189825
## DPM1 11.156715 10.822571
## SCYL3 10.590587 10.946906
## C1orf112 9.519636 9.339850
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-D8-A1XU-01A-11R-A14M-07
## TSPAN6 12.302353 12.463013
## TNMD 4.459432 2.807355
## DPM1 11.945444 12.266494
## SCYL3 10.611025 11.149747
## C1orf112 9.388017 9.400879
## TCGA-A1-A0SN-01A-11R-A144-07
## TSPAN6 9.714246
## TNMD 1.000000
## DPM1 12.419960
## SCYL3 11.136350
## C1orf112 9.884171
Start by ranking genes based on their variation across samples
row.var <- sort(apply(exprs(brca), 1, var), decreasing=TRUE)
head(row.var)
## CLEC3A SCGB2A2 CPB1 TFF1 SCGB1D2 KCNJ3
## 29.73892 25.49291 24.59669 21.00591 20.25785 19.56774
To save time, we’ll run PCA on the top 2500 most variable genes
df <- brca[names(row.var)[1:2500]] %>%
exprs() %>%
t() %>%
data.frame()
pca <- prcomp(df)
pca.summary <- summary(pca)
pca.summary$importance[,1:5]
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 47.42767 40.65232 29.32491 23.58570 20.28785
## Proportion of Variance 0.14934 0.10972 0.05709 0.03693 0.02733
## Cumulative Proportion 0.14934 0.25906 0.31615 0.35309 0.38041
df$tumor_subtype <- brca$tumor_subtype
autoplot(pca, data=df, colour='tumor_subtype')
df.pca <- cbind(pca$x[,c(1:3)], brca$tumor_subtype) %>%
as.data.frame() %>%
set_colnames(c("PC1", "PC2", "PC3", "tumor_subtype"))
head(df.pca)
## PC1 PC2
## TCGA-A8-A085-01A-11R-A00Z-07 -89.9895594300379 14.4728151158205
## TCGA-A2-A0SY-01A-31R-A084-07 8.62013654000677 -39.5325958432916
## TCGA-AR-A24Z-01A-11R-A169-07 -41.8816734638114 -26.6304385642973
## TCGA-D8-A1XU-01A-11R-A14M-07 -22.8004029255938 -38.3033437021283
## TCGA-A1-A0SN-01A-11R-A144-07 -26.3108798601652 1.88579216258153
## TCGA-D8-A73W-01A-22R-A352-07 -46.7262990133077 6.82995550570086
## PC3 tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07 -38.4190093513565 LumB
## TCGA-A2-A0SY-01A-31R-A084-07 7.54869728087252 LumA
## TCGA-AR-A24Z-01A-11R-A169-07 11.3252895436753 LumB
## TCGA-D8-A1XU-01A-11R-A14M-07 24.3111755582573 LumA
## TCGA-A1-A0SN-01A-11R-A144-07 21.69949616053 Her2
## TCGA-D8-A73W-01A-22R-A352-07 -12.9151476451899 LumB
p <- plot_ly(df.pca,
x = ~PC1,
y = ~PC2,
z = ~PC3,
type="scatter3d",
mode = "markers",
color = ~tumor_subtype,
marker = list(size = 3))
p